packages <- c(
"tidyverse", "ggplot2", "dplyr", "readr", "lme4",
"sjPlot", "gridExtra", "viridis", "plotly",
"knitr", "kableExtra", "readxl", "tidyr"
)
# Installing missing packages
install_if_missing <- function(packages) {
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if (length(new_packages)) {
install.packages(new_packages)
}
}
install_if_missing(packages)
# Loading all packages
lapply(packages, library, character.only = TRUE)
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "lme4" "Matrix" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[6]]
## [1] "sjPlot" "lme4" "Matrix" "lubridate" "forcats" "stringr"
## [7] "dplyr" "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [13] "tidyverse" "stats" "graphics" "grDevices" "utils" "datasets"
## [19] "methods" "base"
##
## [[7]]
## [1] "gridExtra" "sjPlot" "lme4" "Matrix" "lubridate" "forcats"
## [7] "stringr" "dplyr" "purrr" "readr" "tidyr" "tibble"
## [13] "ggplot2" "tidyverse" "stats" "graphics" "grDevices" "utils"
## [19] "datasets" "methods" "base"
##
## [[8]]
## [1] "viridis" "viridisLite" "gridExtra" "sjPlot" "lme4"
## [6] "Matrix" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[9]]
## [1] "plotly" "viridis" "viridisLite" "gridExtra" "sjPlot"
## [6] "lme4" "Matrix" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "knitr" "plotly" "viridis" "viridisLite" "gridExtra"
## [6] "sjPlot" "lme4" "Matrix" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[11]]
## [1] "kableExtra" "knitr" "plotly" "viridis" "viridisLite"
## [6] "gridExtra" "sjPlot" "lme4" "Matrix" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[12]]
## [1] "readxl" "kableExtra" "knitr" "plotly" "viridis"
## [6] "viridisLite" "gridExtra" "sjPlot" "lme4" "Matrix"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[13]]
## [1] "readxl" "kableExtra" "knitr" "plotly" "viridis"
## [6] "viridisLite" "gridExtra" "sjPlot" "lme4" "Matrix"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
hiv_data_path <- "C:/Users/PC/Desktop/Internship_task_CEMA/internship_task_dscience-main/HIV data 2000-2023.csv"
poverty_data_path <- "C:/Users/PC/Desktop/Internship_task_CEMA/internship_task_dscience-main/multidimensional_poverty.xlsx"
hiv_data <- read_tsv(hiv_data_path)
poverty_data <- read_excel(poverty_data_path, skip = 2)
glimpse(hiv_data)
## Rows: 1,552
## Columns: 1
## $ `IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,Value` <chr> …
glimpse(poverty_data)
## Rows: 110
## Columns: 16
## $ ...1 <chr> "SSA", "ECA", "LAC", "ECA", "EAP", "ECA",…
## $ ...2 <chr> "AGO", "ALB", "ARG", "ARM", "AUS", "AUT",…
## $ ...3 <chr> "Angola", "Albania", "Argentina", "Armeni…
## $ ...4 <dbl> 2018, 2012, 2010, 2010, 2010, 2009, 2013,…
## $ ...5 <chr> "IDREA", "HBS", "EPHC-S2", "ILCS", "SIH-L…
## $ ...6 <dbl> 2018, 2018, 2021, 2021, 2018, 2022, 2020,…
## $ ...7 <chr> "N", "N", "U", "N", "N", "N", "N", "N", "…
## $ ...8 <chr> "c", "c", "i", "c", "I", "i", "c", "i", "…
## $ ...9 <dbl> 2, 1, 3, 1, 3, 2, 1, 2, 1, 3, 2, 5, 1, 5,…
## $ `Monetary (%)` <dbl> 31.122004986, 0.048107048, 0.894217938, 0…
## $ `Educational attainment (%)` <chr> "29.7534227371215", "0.192380091175436", …
## $ `Educational enrollment (%)` <chr> "27.443060278892499", "-", "0.73135080747…
## $ `Electricity (%)` <chr> "52.639532089233398", "6.0250010574236498…
## $ `Sanitation (%)` <chr> "53.637516498565596", "6.5797723829746202…
## $ `Drinking water (%)` <chr> "32.106507280141003", "9.5949657452627903…
## $ ...16 <dbl> 47.20360637, 0.29316142, 0.90657296, 0.52…
Let’s clean and prepare the HIV dataset:
hiv_data_clean <- hiv_data %>%
# Spliting the single column into multiple columns based on commas
separate(`IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,Value`,
into = c("indicator_code", "indicator", "value_type", "parent_location_code",
"parent_location", "location_type", "spatial_dim_value_code", "country",
"period_type", "year", "value"),
sep = ",", extra = "drop", fill = "right") %>%
# Cleaning up invalid UTF-8 characters
mutate(across(everything(), ~ stringi::stri_enc_toascii(.))) %>%
# Extracting the numeric values from the 'value' column
mutate(
hiv_cases = as.numeric(gsub(" .*", "", value)),
# Converting "No data" and similar entries to NA
hiv_cases = ifelse(grepl("No data|<", value), NA, hiv_cases),
# Converting to thousands for easier analysis
hiv_cases = hiv_cases * 1000
) %>%
# Filtering out rows with NA values for HIV cases
filter(!is.na(hiv_cases)) %>%
# Selecting relevant columns
select(
country = country,
year = year,
hiv_cases,
who_region = parent_location_code
) %>%
# Converting years to numeric
mutate(year = as.numeric(year)) %>%
# Filtering data for the years of interest (2000-2023)
filter(year >= 2000 & year <= 2023)
# Displays the structure of the cleaned data
print("Structure of cleaned HIV data:")
## [1] "Structure of cleaned HIV data:"
glimpse(hiv_data_clean)
## Rows: 1,065
## Columns: 4
## $ country <chr> "Angola", "Angola", "Angola", "Angola", "Angola", "Angola",…
## $ year <dbl> 2023, 2022, 2021, 2020, 2015, 2010, 2005, 2000, 2023, 2022,…
## $ hiv_cases <dbl> 320000, 320000, 320000, 320000, 300000, 250000, 190000, 140…
## $ who_region <chr> "AFR", "AFR", "AFR", "AFR", "AFR", "AFR", "AFR", "AFR", "AF…
# Displays first few rows of the cleaned data
print("First few rows of cleaned HIV data:")
## [1] "First few rows of cleaned HIV data:"
head(hiv_data_clean)
## # A tibble: 6 Ă— 4
## country year hiv_cases who_region
## <chr> <dbl> <dbl> <chr>
## 1 Angola 2023 320000 AFR
## 2 Angola 2022 320000 AFR
## 3 Angola 2021 320000 AFR
## 4 Angola 2020 320000 AFR
## 5 Angola 2015 300000 AFR
## 6 Angola 2010 250000 AFR
poverty_data_clean <- poverty_data %>%
# Select and rename columns using the numbered system
select(
region = `...1`,
country_code = `...2`,
country = `...3`,
reporting_year = `...4`,
survey_name = `...5`,
survey_year = `...6`,
survey_coverage = `...7`,
welfare_type = `...8`,
survey_comparability = `...9`,
monetary = `Monetary (%)`,
education_attainment = `Educational attainment (%)`,
school_enrollment = `Educational enrollment (%)`,
electricity = `Electricity (%)`,
sanitation = `Sanitation (%)`,
drinking_water = `Drinking water (%)`,
poverty_ratio = `...16`
) %>%
# Convert "-" to NA, then convert to numeric
mutate(
across(c(reporting_year, survey_year), as.numeric),
across(c(monetary, education_attainment, school_enrollment,
electricity, sanitation, drinking_water, poverty_ratio),
~ifelse(. == "-", NA, as.numeric(.)))
)
# Displays the structure of the cleaned data
glimpse(poverty_data_clean)
## Rows: 110
## Columns: 16
## $ region <chr> "SSA", "ECA", "LAC", "ECA", "EAP", "ECA", "SSA", …
## $ country_code <chr> "AGO", "ALB", "ARG", "ARM", "AUS", "AUT", "BDI", …
## $ country <chr> "Angola", "Albania", "Argentina", "Armenia", "Aus…
## $ reporting_year <dbl> 2018, 2012, 2010, 2010, 2010, 2009, 2013, 2009, 2…
## $ survey_name <chr> "IDREA", "HBS", "EPHC-S2", "ILCS", "SIH-LIS", "EU…
## $ survey_year <dbl> 2018, 2018, 2021, 2021, 2018, 2022, 2020, 2022, 2…
## $ survey_coverage <chr> "N", "N", "U", "N", "N", "N", "N", "N", "N", "N",…
## $ welfare_type <chr> "c", "c", "i", "c", "I", "i", "c", "i", "c", "c",…
## $ survey_comparability <dbl> 2, 1, 3, 1, 3, 2, 1, 2, 1, 3, 2, 5, 1, 5, 0, 1, 3…
## $ monetary <dbl> 31.122004986, 0.048107048, 0.894217938, 0.5235208…
## $ education_attainment <dbl> 29.75342274, 0.19238009, 1.08531965, 0.00000000, …
## $ school_enrollment <dbl> 27.4430603, NA, 0.7313508, 1.7930038, NA, NA, 34.…
## $ electricity <dbl> 52.63953209, 0.06025001, 0.00000000, 0.00000000, …
## $ sanitation <dbl> 53.6375165, 6.5797724, 0.2574530, 0.3977255, 0.00…
## $ drinking_water <dbl> 32.1065073, 9.5949657, 0.3640480, 0.6600822, NA, …
## $ poverty_ratio <dbl> 47.20360637, 0.29316142, 0.90657296, 0.52352082, …
Now, let’s identify the countries that contribute to 75% of the global HIV burden:
# To calculate total HIV cases per country (across all available years)
country_totals <- hiv_data_clean %>%
# Use the most recent year for each country for better representation
group_by(country) %>%
filter(year == max(year)) %>%
summarize(total_hiv_cases = sum(hiv_cases, na.rm = TRUE)) %>%
arrange(desc(total_hiv_cases))
# Calculate global total HIV cases
global_total <- sum(country_totals$total_hiv_cases)
# Calculate cumulative percentage of global burden
country_totals <- country_totals %>%
mutate(
percentage = total_hiv_cases / global_total * 100,
cum_percentage = cumsum(percentage)
)
# Identify countries contributing to 75% of global burden
high_burden_countries <- country_totals %>%
filter(cum_percentage <= 75) %>%
pull(country)
# Display the high burden countries
cat("Number of countries contributing to 75% of global burden:", length(high_burden_countries), "\n")
## Number of countries contributing to 75% of global burden: 22
print(high_burden_countries)
## [1] "Israel" "Georgia" "Ireland" "Somalia" "Singapore"
## [6] "Tunisia" "Suriname" "Djibouti" "Estonia" "Latvia"
## [11] "Libya" "Denmark" "Mauritania" "Armenia" "Sri Lanka"
## [16] "Bahamas" "Czechia" "Cabo Verde" "Serbia" "Bulgaria"
## [21] "Belize" "Lithuania"
# Create a table showing these countries and their burden
high_burden_table <- country_totals %>%
filter(country %in% high_burden_countries) %>%
select(country, total_hiv_cases, percentage, cum_percentage) %>%
mutate(
percentage = round(percentage, 2),
cum_percentage = round(cum_percentage, 2),
total_hiv_cases = format(total_hiv_cases, big.mark = ",")
)
kable(high_burden_table,
col.names = c("Country", "Total HIV Cases", "% of Global Burden", "Cumulative %"),
caption = "Countries Contributing to 75% of Global HIV Burden") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Country | Total HIV Cases | % of Global Burden | Cumulative % |
|---|---|---|---|
| Israel | 9,600,000 | 5.20 | 5.20 |
| Georgia | 9,000,000 | 4.88 | 10.08 |
| Ireland | 8,600,000 | 4.66 | 14.74 |
| Somalia | 8,200,000 | 4.44 | 19.18 |
| Singapore | 8,100,000 | 4.39 | 23.57 |
| Tunisia | 8,000,000 | 4.33 | 27.90 |
| Suriname | 7,400,000 | 4.01 | 31.91 |
| Djibouti | 7,300,000 | 3.96 | 35.87 |
| Estonia | 7,300,000 | 3.96 | 39.83 |
| Latvia | 6,800,000 | 3.68 | 43.51 |
| Libya | 6,700,000 | 3.63 | 47.14 |
| Denmark | 6,400,000 | 3.47 | 50.61 |
| Mauritania | 6,400,000 | 3.47 | 54.08 |
| Armenia | 6,300,000 | 3.41 | 57.49 |
| Sri Lanka | 4,700,000 | 2.55 | 60.04 |
| Bahamas | 4,100,000 | 2.22 | 62.26 |
| Czechia | 4,100,000 | 2.22 | 64.48 |
| Cabo Verde | 4,000,000 | 2.17 | 66.65 |
| Serbia | 4,000,000 | 2.17 | 68.81 |
| Bulgaria | 3,800,000 | 2.06 | 70.87 |
| Belize | 3,600,000 | 1.95 | 72.82 |
| Lithuania | 3,600,000 | 1.95 | 74.77 |
Let’s visualize the trend of HIV cases in the high burden countries:
# Filter data for high burden countries
high_burden_data <- hiv_data_clean %>%
filter(country %in% high_burden_countries)
# Create trend visualization
hiv_trend_plot <- high_burden_data %>%
ggplot(aes(x = year, y = hiv_cases/1000000, color = country, group = country)) +
geom_line(linewidth = 1) +
geom_point(size = 2, alpha = 0.7) +
scale_y_continuous(labels = scales::comma, name = "People Living with HIV (Millions)") +
scale_color_viridis_d() +
theme_minimal() +
labs(
title = "HIV Cases Trend in Countries Contributing to 75% of Global Burden",
x = "Year",
color = "Country"
) +
theme(
legend.position = "right",
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
panel.grid.minor = element_blank()
)
# Display the plot
print(hiv_trend_plot)
# Make it interactive with plotly
interactive_trend <- ggplotly(hiv_trend_plot)
interactive_trend
# Create faceted plot for clearer country-specific trends
high_burden_data %>%
ggplot(aes(x = year, y = hiv_cases/1000000, color = country)) +
geom_line(linewidth = 1) +
geom_point(size = 1.5) +
facet_wrap(~country, scales = "free_y") +
scale_y_continuous(labels = scales::comma) +
scale_color_viridis_d() +
theme_minimal() +
labs(
title = "HIV Cases Trend by Country (75% of Global Burden)",
x = "Year",
y = "People Living with HIV (Millions)"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
strip.text = element_text(face = "bold"),
strip.background = element_rect(fill = "lightgray", color = NA)
)
## Part 3: Regional HIV Burden Analysis
# Calculate HIV cases by region and year
regional_data <- hiv_data_clean %>%
group_by(who_region, year) %>%
summarize(regional_hiv_cases = sum(hiv_cases, na.rm = TRUE), .groups = 'drop')
# Plot total HIV cases by region over time
regional_trend_plot <- regional_data %>%
ggplot(aes(x = year, y = regional_hiv_cases/1000000, color = who_region)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2) +
scale_y_continuous(labels = scales::comma) +
scale_color_brewer(palette = "Set1") +
theme_minimal() +
labs(
title = "HIV Cases Trend by WHO Region",
x = "Year",
y = "People Living with HIV (Millions)",
color = "WHO Region"
) +
theme(
legend.position = "right",
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12)
)
print(regional_trend_plot)
# Identify countries contributing to 75% of the burden within each WHO region
# First, calculate the most recent year's data for each country
recent_hiv_data <- hiv_data_clean %>%
group_by(country, who_region) %>%
filter(year == max(year)) %>%
ungroup()
# Now calculate the regional burdens
regional_high_burden <- recent_hiv_data %>%
group_by(who_region) %>%
mutate(
regional_total = sum(hiv_cases, na.rm = TRUE),
percentage = hiv_cases / regional_total * 100
) %>%
arrange(who_region, desc(percentage)) %>%
group_by(who_region) %>%
mutate(cum_percentage = cumsum(percentage)) %>%
filter(cum_percentage <= 75) %>%
ungroup()
# Create a table showing high burden countries by region
regional_burden_table <- regional_high_burden %>%
select(who_region, country, hiv_cases, percentage, cum_percentage) %>%
mutate(
percentage = round(percentage, 2),
cum_percentage = round(cum_percentage, 2),
hiv_cases = format(hiv_cases, big.mark = ",")
)
# Display table by region
for (region in unique(regional_burden_table$who_region)) {
cat("\n### High Burden Countries in Region:", region, "\n")
region_table <- regional_burden_table %>%
filter(who_region == region) %>%
select(-who_region)
print(kable(region_table,
col.names = c("Country", "HIV Cases", "% of Regional Burden", "Cumulative %"),
caption = paste("Countries Contributing to 75% of HIV Burden in Region", region)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
}
##
## ### High Burden Countries in Region: AFR
## <table class="table table-striped table-hover table-condensed" style="margin-left: auto; margin-right: auto;">
## <caption>Countries Contributing to 75% of HIV Burden in Region AFR</caption>
## <thead>
## <tr>
## <th style="text-align:left;"> Country </th>
## <th style="text-align:left;"> HIV Cases </th>
## <th style="text-align:right;"> % of Regional Burden </th>
## <th style="text-align:right;"> Cumulative % </th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td style="text-align:left;"> Mauritania </td>
## <td style="text-align:left;"> 6,400,000 </td>
## <td style="text-align:right;"> 39.25 </td>
## <td style="text-align:right;"> 39.25 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Cabo Verde </td>
## <td style="text-align:left;"> 4,000,000 </td>
## <td style="text-align:right;"> 24.53 </td>
## <td style="text-align:right;"> 63.78 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Malawi </td>
## <td style="text-align:left;"> 980,000 </td>
## <td style="text-align:right;"> 6.01 </td>
## <td style="text-align:right;"> 69.79 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Ethiopia </td>
## <td style="text-align:left;"> 610,000 </td>
## <td style="text-align:right;"> 3.74 </td>
## <td style="text-align:right;"> 73.54 </td>
## </tr>
## </tbody>
## </table>
## ### High Burden Countries in Region: AMR
## <table class="table table-striped table-hover table-condensed" style="margin-left: auto; margin-right: auto;">
## <caption>Countries Contributing to 75% of HIV Burden in Region AMR</caption>
## <thead>
## <tr>
## <th style="text-align:left;"> Country </th>
## <th style="text-align:left;"> HIV Cases </th>
## <th style="text-align:right;"> % of Regional Burden </th>
## <th style="text-align:right;"> Cumulative % </th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td style="text-align:left;"> Suriname </td>
## <td style="text-align:left;"> 7,400,000 </td>
## <td style="text-align:right;"> 39.20 </td>
## <td style="text-align:right;"> 39.20 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Bahamas </td>
## <td style="text-align:left;"> 4,100,000 </td>
## <td style="text-align:right;"> 21.72 </td>
## <td style="text-align:right;"> 60.92 </td>
## </tr>
## </tbody>
## </table>
## ### High Burden Countries in Region: EMR
## <table class="table table-striped table-hover table-condensed" style="margin-left: auto; margin-right: auto;">
## <caption>Countries Contributing to 75% of HIV Burden in Region EMR</caption>
## <thead>
## <tr>
## <th style="text-align:left;"> Country </th>
## <th style="text-align:left;"> HIV Cases </th>
## <th style="text-align:right;"> % of Regional Burden </th>
## <th style="text-align:right;"> Cumulative % </th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td style="text-align:left;"> Somalia </td>
## <td style="text-align:left;"> 8,200,000 </td>
## <td style="text-align:right;"> 18.32 </td>
## <td style="text-align:right;"> 18.32 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Tunisia </td>
## <td style="text-align:left;"> 8,000,000 </td>
## <td style="text-align:right;"> 17.87 </td>
## <td style="text-align:right;"> 36.19 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Djibouti </td>
## <td style="text-align:left;"> 7,300,000 </td>
## <td style="text-align:right;"> 16.31 </td>
## <td style="text-align:right;"> 52.50 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Libya </td>
## <td style="text-align:left;"> 6,700,000 </td>
## <td style="text-align:right;"> 14.97 </td>
## <td style="text-align:right;"> 67.46 </td>
## </tr>
## </tbody>
## </table>
## ### High Burden Countries in Region: EUR
## <table class="table table-striped table-hover table-condensed" style="margin-left: auto; margin-right: auto;">
## <caption>Countries Contributing to 75% of HIV Burden in Region EUR</caption>
## <thead>
## <tr>
## <th style="text-align:left;"> Country </th>
## <th style="text-align:left;"> HIV Cases </th>
## <th style="text-align:right;"> % of Regional Burden </th>
## <th style="text-align:right;"> Cumulative % </th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td style="text-align:left;"> Israel </td>
## <td style="text-align:left;"> 9,600,000 </td>
## <td style="text-align:right;"> 12.04 </td>
## <td style="text-align:right;"> 12.04 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Georgia </td>
## <td style="text-align:left;"> 9,000,000 </td>
## <td style="text-align:right;"> 11.29 </td>
## <td style="text-align:right;"> 23.33 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Ireland </td>
## <td style="text-align:left;"> 8,600,000 </td>
## <td style="text-align:right;"> 10.79 </td>
## <td style="text-align:right;"> 34.11 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Estonia </td>
## <td style="text-align:left;"> 7,300,000 </td>
## <td style="text-align:right;"> 9.16 </td>
## <td style="text-align:right;"> 43.27 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Latvia </td>
## <td style="text-align:left;"> 6,800,000 </td>
## <td style="text-align:right;"> 8.53 </td>
## <td style="text-align:right;"> 51.80 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Denmark </td>
## <td style="text-align:left;"> 6,400,000 </td>
## <td style="text-align:right;"> 8.03 </td>
## <td style="text-align:right;"> 59.82 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Armenia </td>
## <td style="text-align:left;"> 6,300,000 </td>
## <td style="text-align:right;"> 7.90 </td>
## <td style="text-align:right;"> 67.72 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Czechia </td>
## <td style="text-align:left;"> 4,100,000 </td>
## <td style="text-align:right;"> 5.14 </td>
## <td style="text-align:right;"> 72.87 </td>
## </tr>
## </tbody>
## </table>
## ### High Burden Countries in Region: SEAR
## <table class="table table-striped table-hover table-condensed" style="margin-left: auto; margin-right: auto;">
## <caption>Countries Contributing to 75% of HIV Burden in Region SEAR</caption>
## <thead>
## <tr>
## <th style="text-align:left;"> Country </th>
## <th style="text-align:left;"> HIV Cases </th>
## <th style="text-align:right;"> % of Regional Burden </th>
## <th style="text-align:right;"> Cumulative % </th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td style="text-align:left;"> Sri Lanka </td>
## <td style="text-align:left;"> 4,700,000 </td>
## <td style="text-align:right;"> 51.21 </td>
## <td style="text-align:right;"> 51.21 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> Timor-Leste </td>
## <td style="text-align:left;"> 1,700,000 </td>
## <td style="text-align:right;"> 18.52 </td>
## <td style="text-align:right;"> 69.74 </td>
## </tr>
## </tbody>
## </table>
## ### High Burden Countries in Region: WPR
## <table class="table table-striped table-hover table-condensed" style="margin-left: auto; margin-right: auto;">
## <caption>Countries Contributing to 75% of HIV Burden in Region WPR</caption>
## <thead>
## <tr>
## <th style="text-align:left;"> Country </th>
## <th style="text-align:left;"> HIV Cases </th>
## <th style="text-align:right;"> % of Regional Burden </th>
## <th style="text-align:right;"> Cumulative % </th>
## </tr>
## </thead>
## <tbody>
## <tr>
## <td style="text-align:left;"> Singapore </td>
## <td style="text-align:left;"> 8,100,000 </td>
## <td style="text-align:right;"> 51.60 </td>
## <td style="text-align:right;"> 51.60 </td>
## </tr>
## <tr>
## <td style="text-align:left;"> New Zealand </td>
## <td style="text-align:left;"> 3,600,000 </td>
## <td style="text-align:right;"> 22.93 </td>
## <td style="text-align:right;"> 74.54 </td>
## </tr>
## </tbody>
## </table>
# Create visualization for each WHO region showing high burden countries
# Extract data for high burden countries by region
regional_trends_data <- hiv_data_clean %>%
inner_join(regional_high_burden %>% select(who_region, country), by = c("who_region", "country"))
# Create plots for each region
for (region in unique(regional_high_burden$who_region)) {
region_data <- regional_trends_data %>%
filter(who_region == region)
region_plot <- region_data %>%
ggplot(aes(x = year, y = hiv_cases/1000000, color = country)) +
geom_line(linewidth = 1) +
geom_point(size = 2, alpha = 0.7) +
scale_y_continuous(labels = scales::comma) +
scale_color_viridis_d() +
theme_minimal() +
labs(
title = paste("HIV Cases Trend in", region, "Region"),
subtitle = "Countries contributing to 75% of regional burden",
x = "Year",
y = "People Living with HIV (Millions)",
color = "Country"
) +
theme(
legend.position = "right",
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12)
)
print(region_plot)
}
# Create faceted plot by region with countries contributing to 75% of regional burden
regional_trends_data %>%
ggplot(aes(x = year, y = hiv_cases/1000000, color = country)) +
geom_line(linewidth = 0.8) +
geom_point(size = 1.5, alpha = 0.7) +
facet_wrap(~who_region, scales = "free_y") +
scale_y_continuous(labels = scales::comma) +
scale_color_viridis_d() +
theme_minimal() +
labs(
title = "HIV Cases Trend by WHO Region",
subtitle = "Countries contributing to 75% of regional burden",
x = "Year",
y = "People Living with HIV (Millions)"
) +
theme(
legend.position = "bottom",
legend.title = element_blank(),
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
strip.text = element_text(face = "bold"),
strip.background = element_rect(fill = "lightgray", color = NA)
)
## Relationship Between HIV and Multidimensional Poverty
Let’s merge the HIV data with the multidimensional poverty data:
# Merge HIV and poverty data
# Since poverty data's reporting_year may not match exactly with HIV data's year,
# we'll need to make some adjustments
# First, let's get the most recent HIV data for each country
recent_hiv_data <- hiv_data_clean %>%
group_by(country) %>%
filter(year == max(year)) %>%
ungroup()
# Now merge with poverty data
merged_data <- recent_hiv_data %>%
inner_join(poverty_data_clean, by = "country")
# Check the merged data
cat("Number of countries in merged dataset:", nrow(merged_data), "\n")
## Number of countries in merged dataset: 79
# Check for missing values after merge
missing_values <- merged_data %>%
summarize(across(everything(), ~sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "variable", values_to = "missing_count") %>%
filter(missing_count > 0)
print(missing_values)
## # A tibble: 4 Ă— 2
## variable missing_count
## <chr> <int>
## 1 school_enrollment 23
## 2 electricity 1
## 3 sanitation 16
## 4 drinking_water 6
# Create scatterplot of HIV cases vs. poverty ratio
hiv_poverty_plot <- merged_data %>%
ggplot(aes(x = poverty_ratio, y = hiv_cases/1000000)) +
geom_point(aes(color = who_region, size = hiv_cases), alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
scale_y_continuous(labels = scales::comma) +
scale_size_continuous(guide = "none") +
theme_minimal() +
labs(
title = "Relationship Between HIV Cases and Multidimensional Poverty",
x = "Multidimensional Poverty Headcount Ratio (%)",
y = "People Living with HIV (Millions)",
color = "WHO Region"
) +
theme(
legend.position = "right",
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12)
)
print(hiv_poverty_plot)
# Calculate and save correlations
correlation_analysis <- merged_data %>%
summarize(
correlation_poverty_ratio = cor(hiv_cases, poverty_ratio, use = "complete.obs"),
correlation_monetary = cor(hiv_cases, monetary, use = "complete.obs"),
correlation_education = cor(hiv_cases, education_attainment, use = "complete.obs"),
correlation_school = cor(hiv_cases, school_enrollment, use = "complete.obs"),
correlation_electricity = cor(hiv_cases, electricity, use = "complete.obs"),
correlation_sanitation = cor(hiv_cases, sanitation, use = "complete.obs"),
correlation_water = cor(hiv_cases, drinking_water, use = "complete.obs")
)
# Convert correlations to a more readable format
correlation_tidy <- correlation_analysis %>%
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "correlation"
) %>%
mutate(
variable = gsub("correlation_", "", variable),
variable = gsub("_", " ", variable),
variable = stringr::str_to_title(variable)
)
# Create bar plot of correlations
correlation_plot <- ggplot(correlation_tidy, aes(x = reorder(variable, correlation), y = correlation)) +
geom_col(fill = "steelblue") +
coord_flip() +
theme_minimal() +
labs(
title = "Correlation between HIV Cases and Poverty Factors",
x = "Poverty Factor",
y = "Correlation Coefficient"
) +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12)
)
print(correlation_plot)
Let’s account for random effects (country, region) in our analysis:
# Prepare data for mixed effects model
model_data <- merged_data %>%
# Select relevant columns and remove NAs
select(country, who_region, hiv_cases,
poverty_ratio, monetary, education_attainment,
school_enrollment, electricity, sanitation, drinking_water) %>%
filter(complete.cases(.))
# Log-transform HIV cases for better model fit
model_data <- model_data %>%
mutate(log_hiv_cases = log(hiv_cases + 1))
# Scale variables for better model convergence
model_data_scaled <- model_data %>%
mutate(across(c(poverty_ratio, monetary, education_attainment,
school_enrollment, electricity, sanitation, drinking_water),
scale))
# Fit mixed effects model with country and region as random effects
mixed_model <- try(
lmer(log_hiv_cases ~
poverty_ratio + monetary + education_attainment +
school_enrollment + electricity + sanitation + drinking_water +
(1|who_region),
data = model_data_scaled),
silent = TRUE
)
# Load the package
library(glmmTMB)
# Check if model converged
if(!inherits(mixed_model, "try-error")) {
# Model summary
summary_model <- summary(mixed_model)
print(summary_model)
# Plot the model results
plot_model(mixed_model, type = "est", sort.est = TRUE) +
labs(title = "Mixed Effects Model: Factors Affecting HIV Cases",
subtitle = "Accounting for region random effects")
# Variance explained by random effects
plot_model(mixed_model, type = "re") +
labs(title = "Random Effects by WHO Region")
# Check model diagnostics
plot_model(mixed_model, type = "diag")
} else {
# If the model fails, try a simpler version
cat("Full model did not converge. Trying a simpler model.\n")
simple_model <- lm(log_hiv_cases ~
poverty_ratio + monetary + education_attainment +
school_enrollment + electricity + sanitation + drinking_water,
data = model_data_scaled)
summary_simple <- summary(simple_model)
print(summary_simple)
# Plot coefficients
plot_model(simple_model, type = "est", sort.est = TRUE) +
labs(title = "Linear Model: Factors Affecting HIV Cases")
}
## Linear mixed model fit by REML ['lmerMod']
## Formula: log_hiv_cases ~ poverty_ratio + monetary + education_attainment +
## school_enrollment + electricity + sanitation + drinking_water +
## (1 | who_region)
## Data: model_data_scaled
##
## REML criterion at convergence: 223.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.89075 -0.67078 -0.02105 0.67203 2.10069
##
## Random effects:
## Groups Name Variance Std.Dev.
## who_region (Intercept) 1.213 1.101
## Residual 4.467 2.113
## Number of obs: 54, groups: who_region, 6
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 11.94256 0.59095 20.209
## poverty_ratio -1.88010 3.52584 -0.533
## monetary -0.87840 1.47788 -0.594
## education_attainment 1.27994 0.83832 1.527
## school_enrollment 0.09196 0.80406 0.114
## electricity 1.67694 1.14806 1.461
## sanitation -0.67444 0.80804 -0.835
## drinking_water -0.07581 0.66910 -0.113
##
## Correlation of Fixed Effects:
## (Intr) pvrty_ montry edctn_ schl_n elctrc santtn
## poverty_rat 0.105
## monetary -0.056 -0.909
## edctn_ttnmn -0.015 -0.658 0.639
## schl_nrllmn -0.082 -0.697 0.675 0.151
## electricity -0.102 -0.715 0.461 0.395 0.482
## sanitation 0.009 -0.268 0.233 -0.040 0.132 -0.085
## drinkng_wtr 0.039 -0.578 0.538 0.506 0.273 0.348 -0.056
## [[1]]
##
## [[2]]
## [[2]]$who_region
##
##
## [[3]]
##
## [[4]]
This analysis has explored:
Key findings:
Regional Trends:
- Europe (EUR): Steady upward trends, especially in
Israel and Ireland.
- Eastern Mediterranean (EMR): Dramatic rises
post-2015, with Somalia and Djibouti spiking sharply.
Africa (AFR): Surprisingly lower case numbers despite higher poverty levels.
Southeast Asia (SEAR) & Western Pacific (WPR): More moderate, stable growth patterns.
Poverty and HIV Relationship:
- All poverty-related factors show a weak negative correlation with HIV
cases (ranging from -0.20 to -0.15).
- Strongest negative links: overall poverty ratio, monetary poverty,
sanitation, and electricity access.
- Moderate links: education, school enrollment, and water access.
Insights:
- Countries with better living standards report higher HIV
cases.
- Possible reasons:
- Better testing and reporting systems
- Higher survival rates due to advanced healthcare
- Stronger public health surveillance
Summary:
The relationship between poverty and HIV is more complex than expected.
Strong healthcare systems and reporting capabilities may explain why
wealthier countries show higher HIV numbers. This highlights the need
for region-specific strategies that go beyond economic measures,
focusing on healthcare access, surveillance, and tailored
interventions.
# Load required libraries
library(tidyverse)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(viridis)
library(knitr)
# Read the dataset
mortality_data <- read.csv("C:/Users/PC/Downloads/dataset_datascience.csv",
stringsAsFactors = FALSE)
# East African Community countries
eac_countries <- c("Burundi", "Kenya", "Rwanda", "South Sudan", "Tanzania",
"Uganda", "Democratic Republic of the Congo", "Somalia")
# Filter for East African Community countries and included estimates
eac_data <- mortality_data %>%
filter(Geographic.area %in% eac_countries) %>%
filter(Observation.Status == "Included in IGME") %>%
# Creates a proper year column from Reference.Date
mutate(Year = floor(Reference.Date))
# Splits data by indicator
under_five_data <- eac_data %>%
filter(Indicator == "Under-five mortality rate")
neonatal_data <- eac_data %>%
filter(Indicator == "Neonatal mortality rate")
# Gets latest data for each country for mapping
latest_under_five <- under_five_data %>%
group_by(Geographic.area) %>%
arrange(desc(Year)) %>%
slice(1) %>%
ungroup()
latest_neonatal <- neonatal_data %>%
group_by(Geographic.area) %>%
arrange(desc(Year)) %>%
slice(1) %>%
ungroup()
# Checking for countries with data available
cat("EAC countries with under-five mortality data:",
paste(unique(latest_under_five$Geographic.area), collapse=", "), "\n")
## EAC countries with under-five mortality data: Burundi, Democratic Republic of the Congo, Kenya, Rwanda, Somalia, South Sudan, Uganda
cat("EAC countries with neonatal mortality data:",
paste(unique(latest_neonatal$Geographic.area), collapse=", "), "\n")
## EAC countries with neonatal mortality data: Burundi, Democratic Republic of the Congo, Kenya, Rwanda, Somalia, South Sudan, Uganda
# Get world map data
world <- ne_countries(scale = "medium", returnclass = "sf")
# Filter for East African countries
eac_map <- world %>%
filter(name %in% eac_countries | sovereignt %in% eac_countries)
# Checking if all countries have been found
missing_countries <- setdiff(eac_countries,
c(eac_map$name, eac_map$sovereignt))
if(length(missing_countries) > 0) {
warning(paste("Missing countries in map data:",
paste(missing_countries, collapse=", ")))
}
# Merge map data with mortality data
# Adjust the joining based on how country names appear in both datasets
eac_under_five_map <- eac_map %>%
left_join(latest_under_five, by = c("name" = "Geographic.area"))
eac_neonatal_map <- eac_map %>%
left_join(latest_neonatal, by = c("name" = "Geographic.area"))
# If name doesn't match, try with sovereign name
eac_under_five_map <- eac_under_five_map %>%
left_join(latest_under_five %>%
filter(!(Geographic.area %in% eac_map$name)),
by = c("sovereignt" = "Geographic.area"))
eac_neonatal_map <- eac_neonatal_map %>%
left_join(latest_neonatal %>%
filter(!(Geographic.area %in% eac_map$name)),
by = c("sovereignt" = "Geographic.area"))
# function to plot the maps
plot_mortality_map <- function(data, title, legend_title) {
ggplot(data) +
geom_sf(aes(fill = Observation.Value.x), color = "white", size = 0.2) +
scale_fill_viridis(option = "magma",
name = legend_title,
direction = -1,
na.value = "grey80") + # Added handling for NA values
labs(title = title) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
legend.position = "right",
plot.margin = margin(10, 10, 10, 10)
)
}
# Plot under-five mortality map
under_five_map <- plot_mortality_map(
eac_under_five_map,
"Latest Under-Five Mortality Rate in East African Community",
"Deaths per 1,000\nlive births"
)
# Plot neonatal mortality map
neonatal_map <- plot_mortality_map(
eac_neonatal_map,
"Latest Neonatal Mortality Rate in East African Community",
"Deaths per 1,000\nlive births"
)
# Combine maps
gridExtra::grid.arrange(under_five_map, neonatal_map, ncol = 2)
# Function for trend plots
plot_trend <- function(data, title, y_label) {
# Calculate average trend
avg_trend <- data %>%
group_by(Year) %>%
summarize(avg_value = mean(Observation.Value, na.rm = TRUE))
# Creating plot
ggplot() +
# Plot individual country data
geom_line(data = data,
aes(x = Year, y = Observation.Value, color = Geographic.area),
alpha = 0.7) +
geom_point(data = data,
aes(x = Year, y = Observation.Value, color = Geographic.area),
alpha = 0.7) +
# Add average trend line
geom_line(data = avg_trend,
aes(x = Year, y = avg_value),
linetype = "dashed", linewidth = 1.2, color = "black") +
# Labels and styling
labs(
title = title,
x = "Year",
y = y_label,
color = "Country"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
legend.position = "right"
)
}
# Under-five mortality trend
under_five_trend <- plot_trend(
under_five_data,
"Under-Five Mortality Rate Trends in East African Community",
"Deaths per 1,000 live births"
)
# Neonatal mortality trend
neonatal_trend <- plot_trend(
neonatal_data,
"Neonatal Mortality Rate Trends in East African Community",
"Deaths per 1,000 live births"
)
# Display trend plots
gridExtra::grid.arrange(under_five_trend, neonatal_trend, ncol = 2)
# Identify countries with highest under-five mortality rates
highest_under_five <- latest_under_five %>%
arrange(desc(Observation.Value)) %>%
select(Geographic.area, Observation.Value, Year)
# Identify countries with highest neonatal mortality rates
highest_neonatal <- latest_neonatal %>%
arrange(desc(Observation.Value)) %>%
select(Geographic.area, Observation.Value, Year)
# Display results
cat("East African countries with highest under-five mortality rates:\n")
## East African countries with highest under-five mortality rates:
kable(highest_under_five,
col.names = c("Country", "Under-five mortality rate", "Year of estimate"),
digits = 1)
| Country | Under-five mortality rate | Year of estimate |
|---|---|---|
| Somalia | 129.6 | 2003 |
| Democratic Republic of the Congo | 82.3 | 2014 |
| Burundi | 74.2 | 2013 |
| Uganda | 55.9 | 2015 |
| South Sudan | 45.6 | 2007 |
| Kenya | 38.5 | 2021 |
| Rwanda | 34.1 | 2022 |
cat("\nEast African countries with highest neonatal mortality rates:\n")
##
## East African countries with highest neonatal mortality rates:
kable(highest_neonatal,
col.names = c("Country", "Neonatal mortality rate", "Year of estimate"),
digits = 1)
| Country | Neonatal mortality rate | Year of estimate |
|---|---|---|
| South Sudan | 52.0 | 2006 |
| Somalia | 37.5 | 2003 |
| Democratic Republic of the Congo | 27.5 | 2010 |
| Uganda | 25.0 | 2013 |
| Burundi | 23.7 | 2013 |
| Kenya | 21.1 | 2019 |
| Rwanda | 20.0 | 2016 |
The analysis of under-five and neonatal mortality rates across the East African Community reveals significant progress, with a consistent downward trend in mortality rates over time. However, this improvement masks substantial disparities between member states:
Highest mortality rates: South Sudan shows the most concerning figures in recent data, while Somalia and the Democratic Republic of the Congo demonstrated similarly elevated rates in earlier periods. Leading performers: Kenya and Rwanda have achieved markedly lower mortality rates, suggesting effective healthcare policies and interventions that could serve as regional models.
These considerable variations highlight differing levels of healthcare infrastructure, policy effectiveness, and resource allocation across the region. The temporal disparity in latest available data points (ranging from 2003 to 2022) adds complexity to direct comparisons but doesn’t diminish the clear pattern of regional inequality. These findings point to specific opportunities for regional cooperation:
The overall positive trend is encouraging, but achieving equitable child health outcomes across the EAC will require coordinated regional strategies, continued investment in healthcare systems, and data-driven decision-making. Regular monitoring and evaluation remain essential to track progress and adjust interventions accordingly.